# Load the diabetes-indicator survey data and derive a binary response.
# diabetesTF is 1 when Diabetes_012 exceeds 1 (code 2 = diagnosed diabetes)
# and 0 otherwise (code 0 = no diabetes, code 1 = pre-diabetes).
diabetesIndicators <- read_excel("diabetesIndicators.xlsx")
data <- diabetesIndicators %>%
  mutate(diabetesTF = as.numeric(Diabetes_012 > 1))

# Numeric matrix copy for functions that want a matrix (cov, scale, ...).
dataM <- as.matrix(data)
summary(data)
##   Diabetes_012        HighBP         HighChol        CholCheck     
##  Min.   :0.0000   Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:1.0000  
##  Median :0.0000   Median :0.000   Median :0.0000   Median :1.0000  
##  Mean   :0.2969   Mean   :0.429   Mean   :0.4241   Mean   :0.9627  
##  3rd Qu.:0.0000   3rd Qu.:1.000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :2.0000   Max.   :1.000   Max.   :1.0000   Max.   :1.0000  
##       BMI            Smoker           Stroke        HeartDiseaseorAttack
##  Min.   :12.00   Min.   :0.0000   Min.   :0.00000   Min.   :0.00000     
##  1st Qu.:24.00   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000     
##  Median :27.00   Median :0.0000   Median :0.00000   Median :0.00000     
##  Mean   :28.38   Mean   :0.4432   Mean   :0.04057   Mean   :0.09419     
##  3rd Qu.:31.00   3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:0.00000     
##  Max.   :98.00   Max.   :1.0000   Max.   :1.00000   Max.   :1.00000     
##   PhysActivity        Fruits          Veggies       HvyAlcoholConsump
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   
##  1st Qu.:1.0000   1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:0.0000   
##  Median :1.0000   Median :1.0000   Median :1.0000   Median :0.0000   
##  Mean   :0.7565   Mean   :0.6343   Mean   :0.8114   Mean   :0.0562   
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.0000   
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   
##  AnyHealthcare     NoDocbcCost         GenHlth         MentHlth     
##  Min.   :0.0000   Min.   :0.00000   Min.   :1.000   Min.   : 0.000  
##  1st Qu.:1.0000   1st Qu.:0.00000   1st Qu.:2.000   1st Qu.: 0.000  
##  Median :1.0000   Median :0.00000   Median :2.000   Median : 0.000  
##  Mean   :0.9511   Mean   :0.08418   Mean   :2.511   Mean   : 3.185  
##  3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:3.000   3rd Qu.: 2.000  
##  Max.   :1.0000   Max.   :1.00000   Max.   :5.000   Max.   :30.000  
##     PhysHlth         DiffWalk           Sex              Age        
##  Min.   : 0.000   Min.   :0.0000   Min.   :0.0000   Min.   : 1.000  
##  1st Qu.: 0.000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.: 6.000  
##  Median : 0.000   Median :0.0000   Median :0.0000   Median : 8.000  
##  Mean   : 4.242   Mean   :0.1682   Mean   :0.4403   Mean   : 8.032  
##  3rd Qu.: 3.000   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:10.000  
##  Max.   :30.000   Max.   :1.0000   Max.   :1.0000   Max.   :13.000  
##    Education        Income        diabetesTF    
##  Min.   :1.00   Min.   :1.000   Min.   :0.0000  
##  1st Qu.:4.00   1st Qu.:5.000   1st Qu.:0.0000  
##  Median :5.00   Median :7.000   Median :0.0000  
##  Mean   :5.05   Mean   :6.054   Mean   :0.1393  
##  3rd Qu.:6.00   3rd Qu.:8.000   3rd Qu.:0.0000  
##  Max.   :6.00   Max.   :8.000   Max.   :1.0000

We did not remove any of the data in this data set because there were no missing values or invalid entries that would prohibit an effective analysis. We did add a variable, diabetesTF, to represent our response variable in binary form rather than as 0, 1 or 2. Our response variable, Diabetes_012, represents whether someone doesn’t have diabetes, has pre-diabetes or has diabetes. The age and income variables are recorded as numerical groups which represent ranges; these are the closest variables we have to categorical variables, so we did not need to use dummy variables.

# for(i in 1:23) {
#   hist(dataM[,i], xlab = colnames(dataM)[i])
# }

# One sorted dot plot per column: an empirical-CDF-style view of each
# variable's distribution (flat runs = repeated values, jumps = gaps).
for (i in seq_len(ncol(dataM))) {
  # colnames(dataM)[i] is correct for any matrix/data frame; the original
  # colnames(data[, i]) only worked because tibble single-bracket
  # subsetting happens to keep the column name.
  plot(sort(dataM[, i]), ylab = colnames(dataM)[i])
}

# Per-column descriptive statistics: mean, total (corrected) sum of
# squares, variance, standard deviation, kurtosis and skewness.
# kurtosis()/skewness() come from the attached moments/e1071-style package.
for (i in seq_len(ncol(dataM))) {
  col <- dataM[, i]
  print(colnames(dataM)[i])
  # Named col_mean rather than `mean` to avoid shadowing base::mean.
  col_mean <- mean(col)
  sum_of_squares <- sum((col - col_mean)^2)
  print(paste0("Mean     : ", col_mean))
  print(paste0("Sum'O'^2 : ", sum_of_squares))
  print(paste0("Variance : ", var(col)))
  print(paste0("Stdev    : ", sd(col)))
  print(paste0("Kurtosis : ", kurtosis(col)))
  print(paste0("Skewness : ", skewness(col)))
  cat("\n")
}
## [1] "Diabetes_012"
## [1] "Mean     : 0.296921318196153"
## [1] "Sum'O'^2 : 123649.995549512"
## [1] "Variance : 0.487427006372272"
## [1] "Stdev    : 0.698159728409102"
## [1] "Kurtosis : 4.98008508944251"
## [1] "Skewness : 1.97637872974942"
## 
## [1] "HighBP"
## [1] "Mean     : 0.429001103752759"
## [1] "Sum'O'^2 : 62141.238879691"
## [1] "Variance : 0.244960122358142"
## [1] "Stdev    : 0.494934462689902"
## [1] "Kurtosis : 1.0823132041371"
## [1] "Skewness : 0.286902778196897"
## 
## [1] "HighChol"
## [1] "Mean     : 0.424120939766635"
## [1] "Sum'O'^2 : 61959.403969568"
## [1] "Variance : 0.244243331018996"
## [1] "Stdev    : 0.494209804656885"
## [1] "Kurtosis : 1.09429374311925"
## [1] "Skewness : 0.307072862883141"
## 
## [1] "CholCheck"
## [1] "Mean     : 0.962669504888048"
## [1] "Sum'O'^2 : 9116.48021128979"
## [1] "Variance : 0.0359370709096527"
## [1] "Stdev    : 0.189570754362725"
## [1] "Kurtosis : 24.8265288927897"
## [1] "Skewness : -4.88124255623399"
## 
## [1] "BMI"
## [1] "Mean     : 28.3823636076947"
## [1] "Sum'O'^2 : 11079389.4947808"
## [1] "Variance : 43.6748390476974"
## [1] "Stdev    : 6.60869420140601"
## [1] "Kurtosis : 13.997232885003"
## [1] "Skewness : 2.12199121094459"
## 
## [1] "Smoker"
## [1] "Mean     : 0.443168558814254"
## [1] "Sum'O'^2 : 62600.6611124251"
## [1] "Variance : 0.246771160058283"
## [1] "Stdev    : 0.496760666778563"
## [1] "Kurtosis : 1.05235336962997"
## [1] "Skewness : 0.228808587316943"
## 
## [1] "Stroke"
## [1] "Mean     : 0.0405707978555661"
## [1] "Sum'O'^2 : 9874.44534847052"
## [1] "Variance : 0.0389249616581212"
## [1] "Stdev    : 0.197294099400162"
## [1] "Kurtosis : 22.690556891815"
## [1] "Skewness : 4.65731219608638"
## 
## [1] "HeartDiseaseorAttack"
## [1] "Mean     : 0.0941855881425418"
## [1] "Sum'O'^2 : 21642.6237425102"
## [1] "Variance : 0.0853149994383067"
## [1] "Stdev    : 0.292087314750755"
## [1] "Kurtosis : 8.72131452351243"
## [1] "Skewness : 2.77872534150327"
## 
## [1] "PhysActivity"
## [1] "Mean     : 0.756543677073478"
## [1] "Sum'O'^2 : 46724.1374960581"
## [1] "Variance : 0.184186067810336"
## [1] "Stdev    : 0.429169043397046"
## [1] "Kurtosis : 2.4293137036805"
## [1] "Skewness : -1.19553908496565"
## 
## [1] "Fruits"
## [1] "Mean     : 0.634255755282245"
## [1] "Sum'O'^2 : 58847.5174865974"
## [1] "Variance : 0.231976306618196"
## [1] "Stdev    : 0.481639187170434"
## [1] "Kurtosis : 1.31080206667641"
## [1] "Skewness : -0.557496248127656"
## 
## [1] "Veggies"
## [1] "Mean     : 0.811419899085462"
## [1] "Sum'O'^2 : 38817.5165523494"
## [1] "Variance : 0.153018249647584"
## [1] "Stdev    : 0.39117547168449"
## [1] "Kurtosis : 3.53519396733912"
## [1] "Skewness : -1.59222924459361"
## 
## [1] "HvyAlcoholConsump"
## [1] "Mean     : 0.0561967833491012"
## [1] "Sum'O'^2 : 13454.8586565752"
## [1] "Variance : 0.0530389139683428"
## [1] "Stdev    : 0.230301788895229"
## [1] "Kurtosis : 15.8541556975799"
## [1] "Skewness : 3.85410893691135"
## 
## [1] "AnyHealthcare"
## [1] "Mean     : 0.951052507095553"
## [1] "Sum'O'^2 : 11809.2189806055"
## [1] "Variance : 0.0465518193488837"
## [1] "Stdev    : 0.215758706310739"
## [1] "Kurtosis : 18.4815222256971"
## [1] "Skewness : -4.18109103293591"
## 
## [1] "NoDocbcCost"
## [1] "Mean     : 0.0841769157994324"
## [1] "Sum'O'^2 : 19556.4861400189"
## [1] "Variance : 0.0770914665384952"
## [1] "Stdev    : 0.277653500857625"
## [1] "Kurtosis : 9.97165544892485"
## [1] "Skewness : 2.9952721827782"
## 
## [1] "GenHlth"
## [1] "Mean     : 2.51139230526648"
## [1] "Sum'O'^2 : 289611.07623778"
## [1] "Variance : 1.14164387370567"
## [1] "Stdev    : 1.06847736228039"
## [1] "Kurtosis : 2.61672250326385"
## [1] "Skewness : 0.42286437471248"
## 
## [1] "MentHlth"
## [1] "Mean     : 3.18477215389467"
## [1] "Sum'O'^2 : 13939736.1748305"
## [1] "Variance : 54.9502961413064"
## [1] "Stdev    : 7.41284669619617"
## [1] "Kurtosis : 9.44153386829319"
## [1] "Skewness : 2.72113227559153"
## 
## [1] "PhysHlth"
## [1] "Mean     : 4.24208057395144"
## [1] "Sum'O'^2 : 19280282.5898731"
## [1] "Variance : 76.0026749942765"
## [1] "Stdev    : 8.71795130717513"
## [1] "Kurtosis : 6.49614924384468"
## [1] "Skewness : 2.20738186293672"
## 
## [1] "DiffWalk"
## [1] "Mean     : 0.168223746452223"
## [1] "Sum'O'^2 : 35496.0516201514"
## [1] "Variance : 0.139925069162806"
## [1] "Stdev    : 0.374065594732803"
## [1] "Kurtosis : 4.14671036414608"
## [1] "Skewness : 1.77389694293273"
## 
## [1] "Sex"
## [1] "Mean     : 0.440342163355408"
## [1] "Sum'O'^2 : 62517.1383002209"
## [1] "Variance : 0.24644191399454"
## [1] "Stdev    : 0.496429163118506"
## [1] "Kurtosis : 1.05776730825032"
## [1] "Skewness : 0.240348306110786"
## 
## [1] "Age"
## [1] "Mean     : 8.03211920529801"
## [1] "Sum'O'^2 : 2366384.29271523"
## [1] "Variance : 9.32826246049232"
## [1] "Stdev    : 3.05422043416848"
## [1] "Kurtosis : 2.41876507648057"
## [1] "Skewness : -0.359901119768877"
## 
## [1] "Education"
## [1] "Mean     : 5.05043361715547"
## [1] "Sum'O'^2 : 246512.752302113"
## [1] "Variance : 0.97175072553153"
## [1] "Stdev    : 0.985774175727651"
## [1] "Kurtosis : 3.03942873761709"
## [1] "Skewness : -0.777250674664268"
## 
## [1] "Income"
## [1] "Mean     : 6.05387496058026"
## [1] "Sum'O'^2 : 1088194.69091375"
## [1] "Variance : 4.28965224127243"
## [1] "Stdev    : 2.07114756627152"
## [1] "Kurtosis : 2.71965261165786"
## [1] "Skewness : -0.891339720170308"
## 
## [1] "diabetesTF"
## [1] "Mean     : 0.139333017975402"
## [1] "Sum'O'^2 : 30421.1351466414"
## [1] "Variance : 0.119919800798022"
## [1] "Stdev    : 0.346294384589213"
## [1] "Kurtosis : 5.33893931890334"
## [1] "Skewness : 2.08301207843434"

From the information above, we noticed CholCheck, BMI, Stroke, HeartDiseaseorAttack, HvyAlcoholConsump, AnyHealthcare, NoDocbcCost, MentHlth and PhysHlth are fairly skewed, and this is likely because many are binary variables. This could also be because most of the population does or does not experience some of these things; as an example, most don’t have a stroke in their life. On the other hand, Education seems to be well distributed — it has a kurtosis close to 3. We also found BMI interesting because it is a percentile of smallest BMIs, so the value represents the percent of those who have a larger BMI than the individual.

We don’t need to do transformations because our response variable has 3 numerical options, we will likely be using either diabetes_012 or diabetesTF as our response variables and the options of values for these are 0, 1 or 2 and 0 or 1, respectively.

——————————TRANSFORMATION AMENDMENT———————————

We are unable to transform some variables because they are binary. We were able to transform BMI, Income, GenHlth and Education. Some of the transformations we tried were taking the log, squaring (sometimes twice), taking the square root or taking the reciprocal. Some variables like MentHlth and PhysHlth had no improvement after transformation, which can be seen below. Age was the only non-binary variable which was not significantly skewed.

# Candidate normalizing transformations for BMI (right-skewed, ~2.12).
# The reciprocal is the clear winner here (skewness drops to ~0.15).
var <- diabetesIndicators$BMI
skewness(var)
## [1] 2.121991
hist(var ,  xlab = "var")

# Reciprocal: best of the candidates tried.
skewness(1/var)
## [1] 0.1488329
hist(1/var ,  xlab = "reciprocal")

skewness(sqrt(var))
## [1] 1.277831
hist(sqrt(var) ,  xlab = "square root")

skewness(log(var))
## [1] 0.6900795
hist(log(var) ,  xlab = "log")

# Squaring a right-skewed variable worsens the skew, as expected.
skewness(var^2)
## [1] 5.33314
hist(var^2 ,  xlab = "square")

# Candidate normalizing transformations for Income (left-skewed, ~-0.89).
# Raising to the fourth power nearly symmetrizes it (skewness ~0.066).
var <- diabetesIndicators$Income
skewness(var)
## [1] -0.8913397
hist(var ,  xlab = "var")

skewness(1/var)
## [1] 3.287419
hist(1/var ,  xlab = "reciprocal")

skewness(sqrt(var))
## [1] -1.292637
hist(sqrt(var) ,  xlab = "square root")

skewness(log(var))
## [1] -1.852676
hist(log(var) ,  xlab = "log")

# Fourth power, not square — the histogram label now matches the transform.
skewness(var^4)
## [1] 0.06638637
hist(var^4 ,  xlab = "fourth power")

# Candidate normalizing transformations for GenHlth (mildly right-skewed,
# ~0.42). The square root is nearly symmetric (skewness ~-0.025).
var <- diabetesIndicators$GenHlth
skewness(var)
## [1] 0.4228644
hist(var ,  xlab = "var")

skewness(1/var)
## [1] 1.138242
hist(1/var ,  xlab = "reciprocal")

# Square root: best of the candidates tried.
skewness(sqrt(var))
## [1] -0.02544915
hist(sqrt(var) ,  xlab = "square root")

skewness(log(var))
## [1] -0.4584611
hist(log(var) ,  xlab = "log")

skewness(var^2)
## [1] 1.264296
hist(var^2 ,  xlab = "square")

# Candidate normalizing transformations for Education (left-skewed, ~-0.78).
# The fourth power nearly symmetrizes it (skewness ~-0.033).
var <- diabetesIndicators$Education
skewness(var)
## [1] -0.7772507
hist(var ,  xlab = "var")

skewness(1/var)
## [1] 3.746471
hist(1/var ,  xlab = "reciprocal")

skewness(sqrt(var))
## [1] -1.098696
hist(sqrt(var) ,  xlab = "square root")

skewness(log(var))
## [1] -1.584198
hist(log(var) ,  xlab = "log")

# Fourth power, not square — the histogram label now matches the transform.
skewness(var^4)
## [1] -0.03302759
hist(var^4 ,  xlab = "fourth power")

# Candidate transformations for PhysHlth. The +1 shift keeps log() and 1/x
# defined at 0 (PhysHlth has many zeros). No candidate fixes the skew well,
# so PhysHlth is left untransformed.
var <- diabetesIndicators$PhysHlth+1
skewness(var)
## [1] 2.207382
hist(var ,  xlab = "var")

skewness(1/var)
## [1] -0.6929417
hist(1/var ,  xlab = "reciprocal")

skewness(sqrt(var))
## [1] 1.757515
hist(sqrt(var) ,  xlab = "square root")

skewness(log(var))
## [1] 1.247831
hist(log(var) ,  xlab = "log")

skewness(var^2)
## [1] 2.691491
hist(var^2 ,  xlab = "square")

# Candidate transformations for MentHlth. The +1 shift keeps log() and 1/x
# defined at 0 (MentHlth has many zeros). As with PhysHlth, no candidate
# gives a satisfactory improvement, so MentHlth is left untransformed.
var <- diabetesIndicators$MentHlth+1
skewness(var)
## [1] 2.721132
hist(var ,  xlab = "var")

skewness(1/var)
## [1] -0.9693661
hist(1/var ,  xlab = "reciprocal")

skewness(sqrt(var))
## [1] 2.116762
hist(sqrt(var) ,  xlab = "square root")

skewness(log(var))
## [1] 1.521961
hist(log(var) ,  xlab = "log")

skewness(var^2)
## [1] 3.457319
hist(var^2 ,  xlab = "square")

We are doing a reciprocal transformation for BMI, a square root transformation for GenHlth and squaring Income and Education twice (to the fourth power).

# Apply the chosen normalizing transformations on a copy, leaving the
# original `data` untouched: reciprocal of BMI, fourth power of Income and
# Education, square root of GenHlth.
dataT <- data
dataT[["BMI"]] <- 1 / dataT[["BMI"]]
dataT[["Income"]] <- dataT[["Income"]]^4
dataT[["GenHlth"]] <- sqrt(dataT[["GenHlth"]])
dataT[["Education"]] <- dataT[["Education"]]^4

————————————END—————————————

# Covariance, corrected sum-of-squares, and correlation matrices, drawn as
# a single combined heat map: covariances in the upper triangle and
# correlations in the lower triangle.
cov_matrix <- cov(dataM)
# Multiply the covariance by n - 1 to recover sums of squares; use nrow()
# instead of the hard-coded sample size 253680 so the line stays correct if
# the data set changes.
sum_squares_matrix <- cov_matrix * (nrow(dataM) - 1)
cor_matrix <- cor(dataM)
corrplot(cov_matrix, method = "shade", is.corr = FALSE, type = "upper", tl.pos = "t")
corrplot(cor_matrix, method = "shade", type = "lower", add = TRUE, tl.pos = "l")

Looking at our heat map, the bottom half shows the correlations and the top half shows the covariances. There is little covariance among the variables. However, looking at correlation, there is some weaker correlation between many variables and very few strong relationships, as the vast majority of the correlations were below 0.3. For both the covariance and the correlation, the only relationship to watch for multicollinearity is between general health and physical health.

# Univariate outlier screen: flag any cell whose z-score falls outside the
# two-sided 1% normal cutoff (|Z| > qnorm(0.995) ~ 2.576) and set it NA.
qn <- qnorm(1 - 0.01/2)
Z <- scale(dataM) 

dataNA <- dataM
dataNA[which(Z > qn| Z < (-1*qn))] <- NA
# Pairwise-complete correlations; the zero-standard-deviation warning comes
# from near-constant binary columns collapsing to a single value once their
# minority class is flagged as outliers.
#corNA <- cor(dataNA[, c(1:3,5:6,9:11,15:23)], use = "pairwise.complete.obs")
corNA <- cor(dataNA[,], use = "pairwise.complete.obs")
## Warning in cor(dataNA[, ], use = "pairwise.complete.obs"): the standard
## deviation is zero
corrplot(cor_matrix, method = "shade")

corrplot(corNA, method = "shade")

# Inspect the heavily imbalanced binary columns after the univariate outlier
# screen: each now holds only one distinct value, because the minority class
# (e.g. Stroke == 1) was entirely flagged as outliers and set to NA.
corNANA <- dataNA[,c(4,7,8,12,13,14)]
summary(corNANA)
##    CholCheck        Stroke      HeartDiseaseorAttack HvyAlcoholConsump
##  Min.   :1      Min.   :0       Min.   :0            Min.   :0        
##  1st Qu.:1      1st Qu.:0       1st Qu.:0            1st Qu.:0        
##  Median :1      Median :0       Median :0            Median :0        
##  Mean   :1      Mean   :0       Mean   :0            Mean   :0        
##  3rd Qu.:1      3rd Qu.:0       3rd Qu.:0            3rd Qu.:0        
##  Max.   :1      Max.   :0       Max.   :0            Max.   :0        
##  NA's   :9470   NA's   :10292   NA's   :23893        NA's   :14256    
##  AnyHealthcare    NoDocbcCost   
##  Min.   :1       Min.   :0      
##  1st Qu.:1       1st Qu.:0      
##  Median :1       Median :0      
##  Mean   :1       Mean   :0      
##  3rd Qu.:1       3rd Qu.:0      
##  Max.   :1       Max.   :0      
##  NA's   :12417   NA's   :21354

When comparing the correlations before and after outliers were removed, there are a few things that have changed. Some binary variables have become significantly less meaningful because they now only have one value. Additionally there is a cluster of GenHlth, MentHlth, PhysHlth and DiffWalk which seemed relatively correlated before outliers were removed and after outliers were taken out seems less correlated. Overall, taking out outliers at this point seems to just lead to data loss for us.

———————————–MODIFIED OUTLIERS————————————

# Bivariate outlier screen: for every pair of columns, compute each
# observation's squared Mahalanobis distance from the pair's centroid and
# blank (NA) that pair's values for observations beyond the 95% chi-square
# cutoff with 2 df (one per dimension).
lim <- qchisq(.95, 2)
dataMHold <- dataM

for (i in 1:21) {
  for (j in (i + 1):22) {
    dat <- dataM[, c(i, j)]
    # Center on the means of columns i AND j. The original passed
    # mean(dataM[, 1]) for the second coordinate, miscentering every pair
    # on column 1's mean.
    test <- mahalanobis(dat,
                        center = c(mean(dataM[, i]), mean(dataM[, j])),
                        cov = cov(dat))
    # NA out the offending observations in the pair's own columns (i, j);
    # the original wrote NA into columns c(1, i) instead.
    dataMHold[which(test > lim), c(i, j)] <- NA
  }
}

# Compare correlations before (cor_matrix) and after the bivariate
# Mahalanobis outlier removal (dataMHold).
corrplot(cor_matrix, method = "shade")

# As in the univariate screen, binary columns collapse to a single value
# once outliers are removed, producing the zero-standard-deviation warning.
corrplot(cor(dataMHold, use = "pairwise.complete.obs"), method = "shade")
## Warning in cor(dataMHold, use = "pairwise.complete.obs"): the standard deviation
## is zero

# The same imbalanced binary columns, now almost entirely NA after the
# two-dimensional screen — far more extreme than the univariate version.
corNANA <- dataMHold[,c(4,7,8,12,13,14)]
summary(corNANA)
##    CholCheck          Stroke       HeartDiseaseorAttack HvyAlcoholConsump
##  Min.   : NA      Min.   :0        Min.   :0            Min.   :0        
##  1st Qu.: NA      1st Qu.:0        1st Qu.:0            1st Qu.:0        
##  Median : NA      Median :0        Median :0            Median :0        
##  Mean   :NaN      Mean   :0        Mean   :0            Mean   :0        
##  3rd Qu.: NA      3rd Qu.:0        3rd Qu.:0            3rd Qu.:0        
##  Max.   : NA      Max.   :0        Max.   :0            Max.   :0        
##  NA's   :253680   NA's   :253643   NA's   :253643       NA's   :253641   
##  AnyHealthcare     NoDocbcCost    
##  Min.   :1        Min.   :0       
##  1st Qu.:1        1st Qu.:0       
##  Median :1        Median :0       
##  Mean   :1        Mean   :0       
##  3rd Qu.:1        3rd Qu.:0       
##  Max.   :1        Max.   :0       
##  NA's   :253598   NA's   :253560

Taking out outliers according to 2 dimensional outliers seems to have a similar effect when we were taking out one dimensional outliers except much more extreme. We no longer have many of our correlations and this is because most of our variables are binary or categorical. When outliers are taken out, many of our variables become solely one value. We summarized some variables to show this. Outliers may be influential for this data, but taking outliers out makes our data less meaningful so we won’t be taking them out.